# This script makes a crop-level
# data set with decadal extreme-heat shocks

# It also makes a crop by county dataset that
# is used by "make_county_shocks.py" to make the 
# county-level shocks

# Jacob Moscona and Karthik Sastry
# This version: last edited on September 28, 2022

import pandas as pd
import numpy as np
import os


##
## Options
## 

## Directory for both raw data, and data outputs
wd = '/home/karthik/Dropbox (MIT)/climate_crops/QJE_Submission/Replication/Data/'
os.chdir(wd)

## What areas to use
## (Default, if both are false, is to use 1959 Census of Ag)
use_2012_area = False # 2012 census of agriculture
use_1955_area = False # 1955 census of agriculture


## Calculate shocks in the future using climatic projections?
use_projection = False
## If true, the following options set...
proj_year = 2090 # what future year to consider
path = '85' # what emissions pathway


## Calculate shocks for one part of the US?
## 0 = all; 1 = east of 100 meridian; 2 = west of 100 meridian
geo_switch = 0


##
## Load raw data and apply options
##

## These filenames will update, so they are informed by what 
## options were chosen

name_crops = 'crop_shocks'
name_counties = 'crop_x_county_shocks'


## Load data
file_crops = 'crop_database_raw.csv' # Crop names and cutoff temperatures
file_areas = 'county_areas_raw.csv' # Areas for each crop, by county
if use_projection == False:
    file_ddays = 'dday_database_raw.csv' # Degree days, by county
else:
    file_ddays = 'dday_database' + str(path) + '_' + str(proj_year) + '_raw.csv' # Same, but with the projection added
    name_crops += '_' + path + '_' + str(proj_year)
    name_counties += '_' + path + '_' + str(proj_year)

ag_data = pd.read_csv(file_areas)
cr_data = pd.read_csv(file_crops)
dd_data = pd.read_csv(file_ddays)


## Restrict to one part of the USA, if desired
if geo_switch == 1: # east = long < 100
    east_counties = ag_data['lon'] <= 100.0
    ag_data = ag_data.loc[east_counties,:]
    name_crops += '_east'
    name_counties += '_east'
elif geo_switch == 2: # west = long > 100
    west_counties = ag_data['lon'] > 100.0
    ag_data = ag_data.loc[west_counties,:]
    name_crops += '_west'
    name_counties += '_west'


## Tell script what areas to use when
## taking averages
if use_2012_area:
    name_crops += '_2012a'
    name_counties += '_2012a'
    colname = 'area2012'
elif use_1955_area:
    name_crops += '_1955a'
    name_counties += '_1955a'
    colname = 'area_1955'
else:
    colname = 'area'
    
## What years to use for calculation
if use_projection:
    yr_range = [2010,proj_year] 
else:
    yr_range = range(1950,2020,10) # Decades 1950 to 2010, inclusive


    

##
## Main Calculation
##

ncounty = ag_data.shape[0] # Number of counties
area_columns = [x for x in ag_data.columns if 'area' in x]
cnty_out = ag_data[['STATE_FIPS','CNTY_FIPS']].copy() # County level data frame
crop_out = cr_data.copy() # Crop-level data to export, which adds to the previous data
crop_out = crop_out.set_index('crop_id') # Use the numerical indexes, which match the other files

# What temperature cutoffs to use
temps = ('min_opt_temp','max_opt_temp')
# and how to name these in the variable list
names = ('minOpt','maxOpt')

for icrop in crop_out.index:
    
    for year in yr_range:
        
        ag_copy = ag_data.copy()
        ag_copy = pd.merge(ag_copy, dd_data.loc[dd_data.decade == year,:], 
                           how ='left',on=['STATE_FIPS','CNTY_FIPS'])
    
        # calculate and normalize the area on which the crop is grown
        try:
            crop_area = ag_copy[colname + '_' + str(icrop)].values
        except: 
            crop_area = np.zeros((ag_copy.shape[0],)) # area is missing for a few crops for the alternative area choices

        crop_area[np.isnan(crop_area)] = 0 # replace NaN values
        total_area = np.sum(crop_area) 
        crop_area = crop_area / total_area # weights for average
        
        crop_out.loc[icrop,'log_total_area'] = np.log(total_area) # Store total area
 
        # Loop over each cutoff temperature considered
        for i in range(len(temps)):
            
            # GDD Calculation
            temp_name = temps[i]
            column_name = 'gdd_' + names[i] # For the output
            
            cutoff_temp = int(crop_out.loc[icrop,temp_name])
            cutoff_temp = max(cutoff_temp,10)
            cutoff_temp = min(cutoff_temp,45) # bounds
            gdd_name = 'dday_' + str(int(cutoff_temp*100)) # Column name for GDD above X
            gdd_data = ag_copy[gdd_name] # Data for GDD above X
            gdd_hasdata = ~pd.isna(gdd_data)
            # Store the weighted average, over counties where GDD are reported
            crop_out.loc[icrop,column_name+'_'+str(year)] = np.dot(crop_area[gdd_hasdata],gdd_data[gdd_hasdata]) / np.sum(crop_area[gdd_hasdata])
           
                           
        
            # Days above threshold calculation
            day_name = 'above_' + str(int(cutoff_temp*100))
            column_name = 'days_' + names[i] # Column name for days above X
            day_data = ag_copy[day_name]
            day_hasdata = ~pd.isna(day_data)
            crop_out.loc[icrop,column_name + '_'+str(year)] = np.dot(crop_area[day_hasdata],day_data[day_hasdata]) / np.sum(crop_area[day_hasdata])
           
            # For the max_opt_temperature, our main cutoff, also store the county_x_crop data
            # This data frame is basically a relabeling of the dday dataframe, but 
            # indexed by crop names
            if temp_name == "max_opt_temp":
                cnty_out.loc[:,str(icrop) + '_daysHot_' + str(year)] = ag_copy[day_name]
                cnty_out.loc[:,str(icrop) + '_gddHot_' + str(year)] = ag_copy[gdd_name]

        # Also save days over 30 degrees C, as a benchmark
        gdd_data = ag_copy['dday_3000']
        gdd_hasdata = ~pd.isna(gdd_data)
        crop_out.loc[icrop,'gdd30_'+str(year)] = np.dot(crop_area[gdd_hasdata],gdd_data[gdd_hasdata]) / np.sum(crop_area[gdd_hasdata])

        # Also save number of days of extreme cold 
        total_days = 10* (30 + 31 + 30 + 31 + 31 + 30 + 31) # Total days in growing season, per decade
        crop_out.loc[icrop,'coldDays_'+str(year)] = total_days - crop_out.loc[icrop,'days_minOpt_'+str(year)]

##
## Save data
##

# Drop any crops if they did not have any area
crop_out = crop_out[crop_out.log_total_area > -np.inf]
# Save output
crop_out.to_csv(name_crops + '.csv')
cnty_out.to_csv(name_counties + '.csv', index = False)
